Elizabeth Bekele, Alison Cheek
2022-05-03
Import the deaths-due-to-air-pollution data
We are going to rename all seven columns and glimpse the data
# Positional rename: the seven names below are applied to the columns of
# deaths_df in order, so they must be listed in the same order as the columns.
names(deaths_df) <- c(
  "country", "acronym", "year", "total_deaths",
  "indoor_deaths", "outdoor_deaths", "ozone_deaths"
)
glimpse(deaths_df)
## Rows: 6,468
## Columns: 7
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanist…
## $ acronym <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG",…
## $ year <int> 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1…
## $ total_deaths <dbl> 299.4773, 291.2780, 278.9631, 278.7908, 287.1629, 288.0…
## $ indoor_deaths <dbl> 250.3629, 242.5751, 232.0439, 231.6481, 238.8372, 239.9…
## $ outdoor_deaths <dbl> 46.44659, 46.03384, 44.24377, 44.44015, 45.59433, 45.36…
## $ ozone_deaths <dbl> 5.616442, 5.603960, 5.611822, 5.655266, 5.718922, 5.739…
Variables that interest us here include:
Now, let’s take a look at the population data.
## Rows: 12,595
## Columns: 3
## $ Country.Name <chr> "Aruba", "Afghanistan", "Angola", "Albania", "Andorra", "…
## $ Year <int> 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 196…
## $ Count <int> 54211, 8996973, 5454933, 1608800, 13411, 92418, 20481779,…
To get a general idea of the deaths data frame we made, let’s make a plot to see what’s happening. This is a plot of indoor vs. outdoor deaths around the world, with one point per country and year.
# Scatter plot of indoor vs. outdoor deaths, one point per row of deaths_df.
# The extra `text` aesthetic carries a "country, year" label for each point;
# presumably it is there so ggplotly() can show it in the hover tooltip.
d <- ggplot(
  deaths_df,
  aes(
    x = indoor_deaths,
    y = outdoor_deaths,
    text = paste0(country, ", ", year)
  )
) +
  geom_point() +
  labs(title = "Outdoor Deaths vs Indoor Deaths")
# Convert the static ggplot into an interactive plotly widget.
ggplotly(d)
This is a mess, and so we chose two countries from each continent (a high-population and a low-population country) to graph.
First, let’s look at a table of the high- and low-population countries using the world population data set.
# Population rows after 1996 for the six high-population countries.
# The result is left grouped by Year (see the "# Groups:" line in the output).
high_pop_countries <- world_pop %>%
  filter(
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    ),
    Year > 1996
  ) %>%
  group_by(Year)
head(high_pop_countries)
## # A tibble: 6 × 3
## # Groups: Year [1]
## Country.Name Year Count
## <chr> <int> <int>
## 1 Australia 1997 18517000
## 2 Brazil 1997 167209040
## 3 Germany 1997 82034771
## 4 Nigeria 1997 113457663
## 5 Pakistan 1997 131057431
## 6 United States 1997 272657000
# Population rows after 1996 for the six low-population countries.
# The result is left grouped by Year (see the "# Groups:" line in the output).
low_pop_countries <- world_pop %>%
  filter(
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    ),
    Year > 1996
  ) %>%
  group_by(Year)
head(low_pop_countries)
## # A tibble: 6 × 3
## # Groups: Year [1]
## Country.Name Year Count
## <chr> <int> <int>
## 1 Canada 1997 29905948
## 2 Chile 1997 14786220
## 3 Sri Lanka 1997 18470900
## 4 Malawi 1997 10264906
## 5 New Zealand 1997 3781300
## 6 Serbia 1997 7596501
Next, we are going to look at the deaths for the high- and low-population countries using the deaths data frame.
# Death figures after 1996 for the six high-population countries.
# The result is left grouped by year (see the "# Groups:" line in the output).
high_pop_death <- deaths_df %>%
  filter(
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    ),
    year > 1996
  ) %>%
  group_by(year)
head(high_pop_death)
## # A tibble: 6 × 7
## # Groups: year [6]
## country acronym year total_deaths indoor_deaths outdoor_deaths ozone_deaths
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Australia AUS 1997 22.4 0.322 21.8 0.314
## 2 Australia AUS 1998 21.5 0.284 21.0 0.305
## 3 Australia AUS 1999 20.4 0.259 19.9 0.295
## 4 Australia AUS 2000 19.4 0.240 18.9 0.290
## 5 Australia AUS 2001 18.6 0.223 18.1 0.284
## 6 Australia AUS 2002 18.1 0.211 17.7 0.286
# Death figures after 1996 for the six low-population countries.
# The result is left grouped by year (see the "# Groups:" line in the output).
low_pop_death <- deaths_df %>%
  filter(
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    ),
    year > 1996
  ) %>%
  group_by(year)
head(low_pop_death)
## # A tibble: 6 × 7
## # Groups: year [6]
## country acronym year total_deaths indoor_deaths outdoor_deaths ozone_deaths
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Canada CAN 1997 21.9 0.0878 19.9 2.20
## 2 Canada CAN 1998 21.7 0.0824 19.6 2.21
## 3 Canada CAN 1999 21.2 0.0751 19.2 2.19
## 4 Canada CAN 2000 20.3 0.0682 18.3 2.13
## 5 Canada CAN 2001 19.8 0.0641 17.9 2.08
## 6 Canada CAN 2002 19.5 0.0605 17.7 2.05
Lastly, we will join the population and deaths data for each respective country and year.
# Combine high-population deaths with population: match rows on country name
# and year, adding the population column (Count) to the death figures.
# right_join() keeps every row of high_pop_countries, matched or not.
joined_high <- high_pop_death %>%
  right_join(
    high_pop_countries,
    by = c("country" = "Country.Name", "year" = "Year")
  )
head(joined_high)
## # A tibble: 6 × 8
## # Groups: year [6]
## country acronym year total_deaths indoor_deaths outdoor_deaths ozone_deaths
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Australia AUS 1997 22.4 0.322 21.8 0.314
## 2 Australia AUS 1998 21.5 0.284 21.0 0.305
## 3 Australia AUS 1999 20.4 0.259 19.9 0.295
## 4 Australia AUS 2000 19.4 0.240 18.9 0.290
## 5 Australia AUS 2001 18.6 0.223 18.1 0.284
## 6 Australia AUS 2002 18.1 0.211 17.7 0.286
## # … with 1 more variable: Count <int>
# Combine low-population deaths with population: match rows on country name
# and year, adding the population column (Count) to the death figures.
# right_join() keeps every row of low_pop_countries, matched or not.
joined_low <- low_pop_death %>%
  right_join(
    low_pop_countries,
    by = c("country" = "Country.Name", "year" = "Year")
  )
head(joined_low)
## # A tibble: 6 × 8
## # Groups: year [6]
## country acronym year total_deaths indoor_deaths outdoor_deaths ozone_deaths
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Canada CAN 1997 21.9 0.0878 19.9 2.20
## 2 Canada CAN 1998 21.7 0.0824 19.6 2.21
## 3 Canada CAN 1999 21.2 0.0751 19.2 2.19
## 4 Canada CAN 2000 20.3 0.0682 18.3 2.13
## 5 Canada CAN 2001 19.8 0.0641 17.9 2.08
## 6 Canada CAN 2002 19.5 0.0605 17.7 2.05
## # … with 1 more variable: Count <int>
Let’s make a table depicting the high- and low-population countries and their respective average deaths due to pollution.
# Mean total deaths of high-population countries (one row per country).
# No year filter is applied, so the mean covers every year in deaths_df.
# group_by() + summarize() already returns just `country` and the mean, so the
# former select(total_deaths) step was removed: it dropped the grouping column
# and made dplyr re-add it with an "Adding missing grouping variables" message.
deaths_highpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_high = mean(total_deaths))
# Mean total deaths of low-population countries (one row per country).
# No year filter is applied, so the mean covers every year in deaths_df.
# group_by() + summarize() already returns just `country` and the mean, so the
# former select(total_deaths) step was removed: it dropped the grouping column
# and made dplyr re-add it with an "Adding missing grouping variables" message.
deaths_lowpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_low = mean(total_deaths))
|
|
Here’s a graph to clearly visualize the previous table
# Horizontal bar chart: one bar per high-population country, bar length is the
# country's mean total deaths from deaths_highpop_countries.
ggplot(
  deaths_highpop_countries,
  aes(x = country, y = average_death_high)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average total deaths in high-population countries",
    x = "Country",
    y = "Average deaths (per 100,000)"
  )
#Plot of Low Population Deaths (average)
# Horizontal bar chart: one bar per low-population country, bar length is the
# country's mean total deaths from deaths_lowpop_countries.
ggplot(
  deaths_lowpop_countries,
  aes(x = country, y = average_death_low)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average total deaths in low-population countries",
    x = "Country",
    y = "Average deaths (per 100,000)"
  )
First, we split the data into high and low population based on country
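Low population ≈ high population × 0.10 (each low-population country has very roughly one tenth the population of the high-population country it is paired with)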
# Population rows after 1996 for the six high-population countries.
# The result is left grouped by Year (see the "# Groups:" line in the output).
high_pop_countries <- world_pop %>%
  filter(
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    ),
    Year > 1996
  ) %>%
  group_by(Year)
high_pop_countries
## # A tibble: 126 × 3
## # Groups: Year [21]
## Country.Name Year Count
## <chr> <int> <int>
## 1 Australia 1997 18517000
## 2 Brazil 1997 167209040
## 3 Germany 1997 82034771
## 4 Nigeria 1997 113457663
## 5 Pakistan 1997 131057431
## 6 United States 1997 272657000
## 7 Australia 1998 18711000
## 8 Brazil 1998 169785250
## 9 Germany 1998 82047195
## 10 Nigeria 1998 116319759
## # … with 116 more rows
# Population rows after 1996 for the six low-population countries.
# The result is left grouped by Year (see the "# Groups:" line in the output).
low_pop_countries <- world_pop %>%
  filter(
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    ),
    Year > 1996
  ) %>%
  group_by(Year)
low_pop_countries
## # A tibble: 126 × 3
## # Groups: Year [21]
## Country.Name Year Count
## <chr> <int> <int>
## 1 Canada 1997 29905948
## 2 Chile 1997 14786220
## 3 Sri Lanka 1997 18470900
## 4 Malawi 1997 10264906
## 5 New Zealand 1997 3781300
## 6 Serbia 1997 7596501
## 7 Canada 1998 30155173
## 8 Chile 1998 14977733
## 9 Sri Lanka 1998 18564599
## 10 Malawi 1998 10552338
## # … with 116 more rows
# Mean total deaths of high-population countries (one row per country).
# No year filter is applied, so the mean covers every year in deaths_df
# (the glimpse output shows years starting at 1990), not only 1996 onwards.
# NOTE(review): the population averages elsewhere in this report use
# Year > 1996; add `year > 1996` here if both should cover the same period.
# The former select(total_deaths) step was removed: it dropped the grouping
# column and made dplyr re-add it with an "Adding missing grouping variables"
# message, without changing the result.
deaths_highpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_high = mean(total_deaths))
# Mean total deaths of low-population countries (one row per country).
# No year filter is applied, so the mean covers every year in deaths_df
# (the glimpse output shows years starting at 1990).
# NOTE(review): the population averages elsewhere in this report use
# Year > 1996; add `year > 1996` here if both should cover the same period.
# The former select(total_deaths) step was removed: it dropped the grouping
# column and made dplyr re-add it with an "Adding missing grouping variables"
# message, without changing the result.
deaths_lowpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_low = mean(total_deaths))
|
|
# Horizontal bar chart of mean total deaths, one bar per high-population country.
ggplot(
  deaths_highpop_countries,
  aes(x = country, y = average_death_high)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average total deaths in high-population countries",
    x = "Country",
    y = "Average deaths (per 100,000)"
  )

# Same chart for the low-population countries. This is a separate statement:
# it was previously fused onto the end of the first plot's coord_flip() line.
ggplot(
  deaths_lowpop_countries,
  aes(x = country, y = average_death_low)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average total deaths in low-population countries",
    x = "Country",
    y = "Average deaths (per 100,000)"
  )
This shows us the deaths due to pollution, but what about the average population of those countries at that time?
# Mean population of each high-population country over the years after 1996
# (one row per country: Country.Name, average_population).
# The former select(Count) step was removed: it dropped the grouping column and
# made dplyr re-add it with an "Adding missing grouping variables" message,
# without changing the result.
hp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    ),
    Year > 1996
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
#hp_countries_population
# Mean population of each low-population country over the years after 1996
# (one row per country: Country.Name, average_population).
# The former select(Count) step was removed: it dropped the grouping column and
# made dplyr re-add it with an "Adding missing grouping variables" message,
# without changing the result.
lp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    ),
    Year > 1996
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
|
|
#Graph of Population Average
# Horizontal bar chart of mean population, one bar per high-population country.
ggplot(
  hp_countries_population,
  aes(x = Country.Name, y = average_population)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average high-population countries",
    x = "Country",
    y = "Average Population"
  )

# Same chart for the low-population countries. This is a separate statement:
# it was previously fused onto the end of the first plot's coord_flip() line.
ggplot(
  lp_countries_population,
  aes(x = Country.Name, y = average_population)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average low-population countries",
    x = "Country",
    y = "Average Population"
  )
# Join the data sets so we can overlay the two graphs or do a stacked bar chart?
full_join(x, y, by = ): returns all rows and all columns
from both x and y. Where there are no matching values, it returns NA for
the missing ones.